import pandas as pd
import math
import numpy as np
import numpy as np
import pandas as pd
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
# Load the county FIPS lookup table with every column read as text
# (dtype=str) -- presumably so FIPS codes keep their leading zeros.
fips_df = pd.read_csv('data/fips2county.tsv', sep='\t', header='infer', dtype=str, encoding='latin-1')
cancer_df = pd.read_csv('data/cancer_reg.csv', encoding='latin-1')

# add a new column 'Target_div_Income': death rate per unit of median income
cancer_df['Target_div_Income'] = cancer_df['TARGET_deathRate'] / cancer_df['medIncome']

# extract county and state from the 'Geography' column ("<county>, <state>")
cancer_df[['County', 'State']] = cancer_df['Geography'].str.extract(r'(.+), (.+)')
# ' County' is a literal substring, so switch regex matching off explicitly
cancer_df['County'] = cancer_df['County'].str.replace(' County', '', regex=False)

# manually change two county names (row labels refer to the pre-merge index)
# NOTE(review): these two values keep their ' County' / ' Parish' suffix,
# unlike the stripped names above -- confirm they match 'CountyName' in the
# lookup file.
cancer_df.loc[166, 'County'] = 'Dona Ana County'
cancer_df.loc[820, 'County'] = 'La Salle Parish'

# merge the dataframes to get the FIPS codes.
# County names repeat across states, so joining on the name alone pairs each
# row with every same-named county nationwide and multiplies the rows. Join
# on county AND state so each row gets at most its own county's code.
# NOTE(review): assumes 'StateName' holds full state names in the same
# spelling as the state part of 'Geography' -- verify against the lookup file.
cancer_df = pd.merge(
    cancer_df,
    fips_df,
    left_on=['County', 'State'],
    right_on=['CountyName', 'StateName'],
    how='left',
)

# add a new column 'Target_div_LogIncome': death rate over log(median income),
# computed column-wise with numpy instead of a per-row math.log call
cancer_df['Target_div_LogIncome'] = cancer_df['TARGET_deathRate'] / np.log(cancer_df['medIncome'])
cancer_df
# Plotting frame: the county FIPS code (stored under both 'fips' and
# 'CountyFIPS') next to the log-income-adjusted death rate in 'values'.
graphdata = pd.DataFrame(
    {
        'fips': cancer_df['CountyFIPS'],
        'values': cancer_df['Target_div_LogIncome'],
        'CountyFIPS': cancer_df['CountyFIPS'],
    }
)

# Standardise 'values' to z-scores, then keep only scores above +1 standard
# deviation; everything else is set to 0 so that only high outliers remain.
newbieLOG = graphdata.copy()
values_mean = newbieLOG['values'].mean()
values_std = newbieLOG['values'].std()
newbieLOG['anomalies'] = (newbieLOG['values'] - values_mean) / values_std
is_high_outlier = newbieLOG['anomalies'] > 1
newbieLOG['anomalies'] = np.where(is_high_outlier, newbieLOG['anomalies'], 0)

# Only the map key and the score are needed downstream.
newbieLOG = newbieLOG[['fips', 'anomalies']]
newbieLOG
| | fips | anomalies |
|---|---|---|
| 0 | 53035 | 0.000000 |
| 1 | 53037 | 0.000000 |
| 2 | 53039 | 0.000000 |
| 3 | 16061 | 0.000000 |
| 4 | 21135 | 0.000000 |
| ... | ... | ... |
| 14223 | 48159 | 0.000000 |
| 14224 | 50011 | 0.000000 |
| 14225 | 51067 | 0.000000 |
| 14226 | 53021 | 0.000000 |
| 14227 | 20061 | 1.140201 |
14228 rows × 2 columns
from urllib.request import urlopen
import json
# Fetch the county-boundary GeoJSON from the plotly datasets repository and
# parse it into `counties`; the response is closed when the block exits.
county_geojson_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(county_geojson_url) as response:
    counties = json.load(response)
import plotly.express as px
# Interactive county choropleth of the anomaly score: each row of newbieLOG
# is matched to a GeoJSON feature through its 'fips' value.
fig = px.choropleth_mapbox(
    newbieLOG,
    geojson=counties,
    locations='fips',
    color='anomalies',
    color_continuous_scale="Hot_r",
    # fixed colour range of 0 to 4 for the anomaly score
    range_color=(0, 4),
    mapbox_style="carto-positron",
    zoom=3,
    center={"lat": 37.0902, "lon": -95.7129},
    opacity=0.5,
    # Label the column that is actually plotted. The earlier 'unemp' key
    # matched no column of newbieLOG, so it had no effect on the figure.
    labels={'anomalies': 'anomaly z-score'},
)
# Remove the figure margins so the map fills the whole output area.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
# Notebook inspection only: display the column labels of cancer_df as it
# stands after the FIPS merge and the derived-ratio columns were added.
cancer_df.columns
Index(['avgAnnCount', 'avgDeathsPerYear', 'TARGET_deathRate', 'incidenceRate',
'medIncome', 'popEst2015', 'povertyPercent', 'studyPerCap', 'binnedInc',
'MedianAge', 'MedianAgeMale', 'MedianAgeFemale', 'Geography',
'AvgHouseholdSize', 'PercentMarried', 'PctNoHS18_24', 'PctHS18_24',
'PctSomeCol18_24', 'PctBachDeg18_24', 'PctHS25_Over',
'PctBachDeg25_Over', 'PctEmployed16_Over', 'PctUnemployed16_Over',
'PctPrivateCoverage', 'PctPrivateCoverageAlone', 'PctEmpPrivCoverage',
'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack',
'PctAsian', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate',
'Target_div_Income', 'County', 'State', 'StateFIPS', 'CountyFIPS_3',
'CountyName', 'StateName', 'CountyFIPS', 'StateAbbr', 'STATE_COUNTY',
'Target_div_LogIncome'],
dtype='object')
import pandas as pd
import pandas_bokeh
import matplotlib.pyplot as plt
import pgeocode
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
# Presumably directs pandas_bokeh plots to inline notebook output -- confirm
# against the pandas_bokeh documentation.
pandas_bokeh.output_notebook()
import plotly.graph_objects as go
# Reshape the four race-share columns to long format: one row per
# (county row, race column) pair. The source column name lands in 'variable'
# and its percentage in the default 'value' column, while death rate and
# median income are repeated as identifier columns.
outcome_cols = ['TARGET_deathRate', 'medIncome']
race_share_cols = ['PctWhite', 'PctBlack', 'PctAsian', 'PctOtherRace']
df_race = cancer_df[outcome_cols + race_share_cols].melt(
    id_vars=outcome_cols,
    var_name='variable',
)
def get_variable_group(variable):
    """Map a race-percentage column name to its display group.

    The comparison is case-insensitive, so the dataset's actual column names
    ('PctWhite', 'PctBlack', 'PctAsian') are recognised as well as their
    all-lowercase spellings. A case-sensitive comparison against the
    lowercase names never matches the real columns and sends every row to
    'Other'.

    Args:
        variable: Column name, e.g. 'PctWhite'.

    Returns:
        'White', 'Black' or 'Asian' for the matching column, and 'Other' for
        anything else (including 'PctOtherRace').
    """
    groups = {'pctwhite': 'White', 'pctblack': 'Black', 'pctasian': 'Asian'}
    # str() keeps non-string inputs (e.g. NaN) on the 'Other' fallback path
    # instead of raising on a missing string method.
    return groups.get(str(variable).casefold(), 'Other')
# Tag every long-format row with its display group and give each group a
# fixed colour, assigned in order of first appearance.
df_race['variable_group'] = df_race['variable'].apply(get_variable_group)
colors = ['red', 'blue', 'green', 'purple']
color_map = dict(zip(df_race['variable_group'].unique(), colors))

# 2x2 grid with one scatter panel per race column: share of population on x,
# death rate on y, marker area scaled by median income.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
race_columns = df_race['variable'].unique()
for ax, variable in zip(axes.flatten(), race_columns):
    data = df_race[df_race['variable'] == variable]
    point_colors = data['variable_group'].map(color_map)
    point_sizes = data['medIncome'] / 5000
    ax.scatter(
        data['value'],
        data['TARGET_deathRate'],
        c=point_colors,
        s=point_sizes,
        alpha=0.7,
    )
    ax.set_title(variable)
    ax.set_xlabel('Percentage of population by race')
    ax.set_ylabel('Target death rate')
    # same y-range on every panel so the four plots are directly comparable
    ax.set_ylim([100, 400])

plt.suptitle('Impact of race and income on target death rate', fontsize=16)
plt.tight_layout()
# leave headroom for the suptitle after tight_layout has packed the panels
plt.subplots_adjust(top=0.92)
plt.show()